{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Update these variables according to your requirement\n",
    "keyword = \"Leadership Chinese SOEs\" # the double quote will look for the exact keyword,\n",
    "                                            # the simple quote will also look for similar keywords\n",
    "number_of_results = 100 # number of results to look for on Google Scholar\n",
    "start_year = 2017 # start range for search\n",
    "end_year = 2018 # end range for search\n",
    "save_database = False # choose if you would like to save the database to .csv (recommended to correctly visualize the URLs)\n",
    "path = 'csv/'+keyword+'_'+str(start_year)+'_'+str(end_year)+'.csv' # path to save the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def get_citations(content):\n",
    "    out = 0\n",
    "    for char in range(0,len(content)):\n",
    "        if content[char:char+9] == 'Cited by ':\n",
    "            init = char+9                          \n",
    "            for end in range(init+1,init+6):\n",
    "                if content[end] == '<':\n",
    "                    break\n",
    "            out = content[init:end]\n",
    "    return int(out)\n",
    "    \n",
    "def get_year(content):\n",
    "    for char in range(0,len(content)):\n",
    "        if content[char] == '-':\n",
    "            out = content[char-5:char-1]\n",
    "    if not out.isdigit():\n",
    "        out = 0\n",
    "    return int(out)\n",
    "\n",
    "def get_author(content):\n",
    "    for char in range(0,len(content)):\n",
    "        if content[char] == '-':\n",
    "            out = content[2:char-1]\n",
    "            break\n",
    "    return out\n",
    "\n",
    "# Start new session\n",
    "session = requests.Session()\n",
    "\n",
    "# Variables\n",
    "links = list()\n",
    "title = list()\n",
    "citations = list()\n",
    "year = list()\n",
    "rank = list()\n",
    "author = list()\n",
    "rank.append(0) # initialization necessary for incremental purposes\n",
    "\n",
    "# Get content \n",
    "for n in range(0, number_of_results, 10):    \n",
    "    url = 'https://scholar.google.com/scholar?start='+str(n)+'&q='+keyword.replace(' ','+')+'&as_ylo='+str(start_year)+'&as_yhi='\
    +str(end_year)\n",
    "    page = session.get(url)\n",
    "    c = page.content\n",
    "#     print(c)\n",
    "    # Create parser\n",
    "    soup = BeautifulSoup(c, 'html.parser')\n",
    "    \n",
    "    # Get stuff\n",
    "    mydivs = soup.findAll(\"div\", { \"class\" : \"gs_r\" })\n",
    "    \n",
    "    for div in mydivs:\n",
    "        try:\n",
    "            links.append(div.find('h3').find('a').get('href'))\n",
    "        except: # catch *all* exceptions\n",
    "            links.append('Look manually at: https://scholar.google.com/scholar?start='\\\n",
    "                         +str(n)+'&q'+keyword.replace(' ','+'))\n",
    "        \n",
    "        try:\n",
    "            title.append(div.find('h3').find('a').text)\n",
    "        except: \n",
    "            title.append('Could not catch title')\n",
    "            \n",
    "        citations.append(get_citations(str(div.format_string)))\n",
    "        year.append(get_year(div.find('div',{'class' : 'gs_a'}).text))\n",
    "        author.append(get_author(div.find('div',{'class' : 'gs_a'}).text))\n",
    "        rank.append(rank[-1]+1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_database=True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done!\n"
     ]
    }
   ],
   "source": [
    "# Create a dataset and sort by the number of citations\n",
    "data = pd.DataFrame(list(zip(author, title, citations, year, links)), index = rank[1:], columns=['Author', 'Title', 'Citations', 'Year', 'Source'])\n",
    "data = data.rename_axis('Rank', axis=\"columns\")\n",
    "\n",
    "data_ranked = data.sort_values('Citations', ascending=False)\n",
    "\n",
    "# Save results\n",
    "if save_database:\n",
    "    data_ranked.to_csv(path, encoding='utf-8')\n",
    "\n",
    "print('Done!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
